Much of the code and examples are copied/modified from
Blueprints for Text Analytics Using Python by Jens Albrecht, Sidharth Ramachandran, and Christian Winkler (O'Reilly, 2021), 978-1-492-07408-3.
cd ../..
/Users/shanekercheval/repos/nlp-template
%run "source/config/notebook_settings.py"
pd.set_option('display.max_colwidth', None)
from source.library.utilities import Timer, get_logger
from source.library.text_analysis import count_tokens, tf_idf, get_context_from_keyword, count_keywords, count_keywords_by, impurity
from source.library.sklearn_topic_modeling import *
from helpsk.utility import read_pickle
with Timer("Loading Data"):
    # Processed UN General Debate paragraphs (one row per paragraph).
    paragraphs = pd.read_pickle('artifacts/data/processed/un-general-debates-paragraphs.pkl')
    # All topic-model artifacts live in one directory and share the naming
    # scheme `{model}-topics-10-ngrams-1-3__{artifact}.pkl`, so build the
    # paths from one base instead of repeating the literal six times.
    # (Also removes the previous no-op f-strings that needed `# noqa`.)
    topics_dir = 'artifacts/models/topics'
    nmf_vectorizer = read_pickle(f'{topics_dir}/nmf-topics-10-ngrams-1-3__vectorizer.pkl')
    nmf_vectors = read_pickle(f'{topics_dir}/nmf-topics-10-ngrams-1-3__vectors.pkl')
    nmf_model = read_pickle(f'{topics_dir}/nmf-topics-10-ngrams-1-3__model.pkl')
    lda_vectorizer = read_pickle(f'{topics_dir}/lda-topics-10-ngrams-1-3__vectorizer.pkl')
    lda_vectors = read_pickle(f'{topics_dir}/lda-topics-10-ngrams-1-3__vectors.pkl')
    lda_model = read_pickle(f'{topics_dir}/lda-topics-10-ngrams-1-3__model.pkl')
Started: Loading Data Finished (0.26 seconds)
This section provides a basic exploration of the text and dataset.
# Summary statistics for the numeric columns (per the output below, only `year`).
hlp.pandas.numeric_summary(paragraphs)
| # of Non-Nulls | # of Nulls | % Nulls | # of Zeros | % Zeros | Mean | St Dev. | Coef of Var | Skewness | Kurtosis | Min | 10% | 25% | 50% | 75% | 90% | Max | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| year | 279,045 | 0 | 0.0% | 0 | 0.0% | 1,992.4 | 12.6 | 0.0 | 0.1 | -1.1 | 1,970 | 1,975.0 | 1,982.0 | 1,993.0 | 2,003.0 | 2,010.0 | 2,015 |
# Summary of the non-numeric columns (per the output below: `country`, `text`).
hlp.pandas.non_numeric_summary(paragraphs)
| # of Non-Nulls | # of Nulls | % Nulls | Most Freq. Value | # of Unique | % Unique | |
|---|---|---|---|---|---|---|
| country | 279,045 | 0 | 0.0% | Russian Federation | 199 | 0.1% |
| text | 279,045 | 0 | 0.0% | The President returned to the [...] | 278,820 | 99.9% |
# Sanity check: no paragraph text should be empty or whitespace-only.
assert not (paragraphs['text'].str.strip() == '').any()
# Vocabulary learned by the NMF vectorizer; used to label topics below.
nmf_feature_names = nmf_vectorizer.get_feature_names_out()
# Show the top 8 tokens per NMF topic; labels are built from the top 2 tokens.
plot_topics(
model=nmf_model,
features=nmf_feature_names,
top_n_tokens=8,
num_tokens_in_label=2
)
# Show the relative size of each NMF topic across the vectorized corpus.
plot_topic_sizes(
model=nmf_model,
dataset=nmf_vectors,
features=nmf_feature_names,
)
# Inspect how much total topic weight each document receives. NOTE(review):
# NMF topic weights are presumably unnormalized, so per-document sums need
# not equal 1 — this plot shows their actual distribution.
topic_weights = nmf_model.transform(X=nmf_vectors)
row_totals = pd.Series(topic_weights.sum(axis=1))
ax = row_totals.plot(kind='box', vert=False, figsize=(10, 1))
ax.set_title("Distribution Sum of Predicted Values/Topics Per Document")
ax.set_xlabel("Sum of Predicted Values Per Document")
ax.set_yticklabels([])
ax;
def get_topic_sizes_per_year(model, features, vectorizer):
    """
    Return a long-format DataFrame with columns `topic_labels`, `year`, and
    `value`, where `value` is the topic's size among that year's paragraphs.

    NOTE: relies on the module-level `paragraphs` DataFrame (columns `year`
    and `text`).

    model: fitted topic model (e.g. NMF/LDA) with a `transform`-compatible API
    features: feature names from the matching vectorizer
    vectorizer: fitted vectorizer used to transform raw text
    """
    topic_labels = create_topic_labels(
        model=model,
        features=features,
        token_separator=' | ',
        top_n_tokens=2,
    )
    # (Removed a dead `topic_labels.values()` expression that had no effect.)
    years = paragraphs['year'].unique()
    years.sort()

    def get_segment_sizes(texts):
        # Topic sizes for one year's paragraphs, vectorized with the same
        # vectorizer the model was trained with.
        new_data = vectorizer.transform(texts)
        return calculate_topic_sizes(model=model, dataset=new_data)

    sizes_per_year = {
        year: get_segment_sizes(paragraphs.query(f'year == {year}')['text'])
        for year in years
    }
    yearly_dict = {
        year: dict(zip(topic_labels.values(), sizes_per_year[year]))
        for year in years
    }
    df = pd.DataFrame(yearly_dict).reset_index().rename(columns={'index': 'topic_labels'})
    # BUG FIX: previously `value_vars` was `df.columns`, which includes the
    # id column `'topic_labels'` itself — passing a column as both id_vars and
    # value_vars relies on ambiguous pandas.melt behavior. Melt only the year
    # columns.
    year_columns = [col for col in df.columns if col != 'topic_labels']
    df = pd.melt(df, id_vars='topic_labels', value_vars=year_columns, var_name='year')
    return df
# NMF topic sizes per year, in long format (topic_labels, year, value).
topic_sizes_per_year = get_topic_sizes_per_year(
model=nmf_model,
features=nmf_feature_names,
vectorizer=nmf_vectorizer
)
topic_sizes_per_year.head()
| topic_labels | year | value | |
|---|---|---|---|
| 0 | world | peace | 1970 | 0.18 |
| 1 | session | assembly | 1970 | 0.11 |
| 2 | countries | developing | 1970 | 0.10 |
| 3 | rights | human | 1970 | 0.07 |
| 4 | africa | south | 1970 | 0.09 |
# Render the same topics-over-time data with several chart types; the first
# three share identical arguments, so build them in a loop.
chart_kwargs = dict(
    data_frame=topic_sizes_per_year,
    x="year",
    y="value",
    color="topic_labels",
    title="Topics Over Time",
)
for chart in (px.area, px.bar, px.line):
    fig = chart(**chart_kwargs)
    fig.show()

# Scatter with a per-topic LOWESS trendline; points are fully transparent
# (opacity 0) so only the smoothed trend lines show.
fig = px.scatter(trendline="lowess", opacity=0.0, **chart_kwargs)
fig.show()
# import pyLDAvis.sklearn
# display = pyLDAvis.sklearn.prepare(nmf_model, nmf_vectors, nmf_vectorizer, sort_topics=False)
# # pyLDAvis.display(lda_display)
# file_name = f"docs/models/nmf-n-grams-{n_gram_range[0]}-{n_gram_range[1]}.html"
# pyLDAvis.save_html(display, file_name)
Neither the book nor the example above uses TF-IDF with LDA, but neither explains why. Both use TF-IDF with NMF and then switch to CountVectorizer for LDA.
LDA only needs a bag-of-words vector.
# Vocabulary learned by the LDA vectorizer; used to label topics below.
lda_feature_names = lda_vectorizer.get_feature_names_out()
# Show the top 8 tokens per LDA topic; labels are built from the top 2 tokens.
plot_topics(
model=lda_model,
features=lda_feature_names,
top_n_tokens=8,
num_tokens_in_label=2,
token_separator=' | '
)
# Show the relative size of each LDA topic across the vectorized corpus.
plot_topic_sizes(
model=lda_model,
dataset=lda_vectors,
features=lda_feature_names,
top_n_tokens=3,
token_separator=' | '
)
# Same sanity check as for NMF. NOTE(review): LDA's transform presumably
# returns per-document topic distributions, so each row should sum to ~1 —
# this plot confirms (or refutes) that.
doc_topic_matrix = lda_model.transform(lda_vectors)
totals = pd.Series(doc_topic_matrix.sum(axis=1))
ax = totals.plot(kind='box', vert=False, figsize=(10, 1))
ax.set_title("Distribution Sum of Predicted Values/Topics Per Document")
ax.set_xlabel("Sum of Predicted Values Per Document")
ax.set_yticklabels([])
ax;
# LDA topic sizes per year, in long format (topic_labels, year, value).
topic_sizes_per_year = get_topic_sizes_per_year(
model=lda_model,
features=lda_feature_names,
vectorizer=lda_vectorizer
)
topic_sizes_per_year.head()
| topic_labels | year | value | |
|---|---|---|---|
| 0 | development | countries | 1970 | 0.07 |
| 1 | world | peace | 1970 | 0.09 |
| 2 | south | people | 1970 | 0.12 |
| 3 | council | security | 1970 | 0.07 |
| 4 | economic | countries | 1970 | 0.10 |
# Same topics-over-time views as for NMF: area, bar, and line charts share
# identical arguments, so build them in a loop.
chart_kwargs = dict(
    data_frame=topic_sizes_per_year,
    x="year",
    y="value",
    color="topic_labels",
    title="Topics Over Time",
)
for chart in (px.area, px.bar, px.line):
    fig = chart(**chart_kwargs)
    fig.show()

# Scatter with a per-topic LOWESS trendline; points are fully transparent
# (opacity 0) so only the smoothed trend lines show.
fig = px.scatter(trendline="lowess", opacity=0.0, **chart_kwargs)
fig.show()
import pyLDAvis.sklearn

# Build the interactive pyLDAvis topic-browser for the LDA model and save it
# as a standalone HTML page.
lda_display = pyLDAvis.sklearn.prepare(lda_model, lda_vectors, lda_vectorizer, sort_topics=False)
# pyLDAvis.display(lda_display)  # uncomment to render inline in the notebook
# FIX: dropped a pointless f-string prefix (no placeholders in the literal).
file_name = "docs/models/lda-n-grams-1-3.html"
pyLDAvis.save_html(lda_display, file_name)
/Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead. /Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/pyLDAvis/_prepare.py:246: FutureWarning: In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only. /Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses from imp import reload /Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses from imp import reload /Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses from imp import reload /Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses from imp import reload /Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses from imp import reload /Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses from imp import 
reload /Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses from imp import reload /Users/shanekercheval/repos/nlp-template/.venv/lib/python3.9/site-packages/past/builtins/misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses from imp import reload